/* Session setup for an unattended run.
   NOTE(review): "set more 1" is legacy syntax; presumably it suppresses the
   --more-- pause in long output -- confirm against the Stata version in use. */
set more 1
/* Start with no dataset in memory. */
clear

/*
File:	cepr_org_master.do
Date:	Mar 21, 2011, CEPR ORG Version 1.6.1
	Jan 19, 2012, CEPR ORG Version 1.7
	Oct 16, 2012, CEPR ORG Version 1.7.1
	Jan 28, 2013, CEPR ORG Version 1.8
	Dec 20, 2013
	Jan 14, 2014, CEPR ORG Version 1.9
	Sep  8, 2014, CEPR ORG Version 1.9.1
	Mar 12, 2015, CEPR ORG Version 2.0
	Apr  1, 2015, CEPR ORG Version 2.0.1
	Mar  1, 2016, CEPR ORG Version 2.1
	Oct 12, 2016, CEPR ORG Version 2.1.1
	Feb  9, 2017, CEPR ORG Version 2.2
	Apr 21, 2017, CEPR ORG Version 2.2.1
	Apr 10, 2018, CEPR ORG Version 2.3
	Mar 22, 2019, CEPR ORG Version 2.4
	Jul 24, 2019, CEPR ORG Version 2.4.1
	Feb 05, 2020, CEPR ORG Version 2.5

Desc:	Master do-file for creating CEPR consistent extract of CPS ORG
Note:	See copyright notice at the end of this program.
*/

/* Acknowledgements

The CEPR ORG extract uses the NBER's "CPS Labor Extracts 1979 - 1993"
as a base. Complete details and all underlying data are available from 
the NBER (www.nber.org). We are grateful to Jean Roth for assistance 
with the data and to Jean Roth and others at the NBER for their efforts 
to make the CPS data, their programs, and their documentation widely 
available. We are also grateful to Jared Bernstein, Danielle Gao, 
Larry Mishel, David Webster, and others at the Economic
Policy Institute (www.epi.org) for extensive conversations and
assistance over the years with the ORG data. We also thank Dean Baker, 
Annette Bernhardt, Heather Boushey, Julien Champagne, and Helene Jorgensen 
for extensive feedback.

John Schmitt, Ben Zipperer, Hye Jin Rho, Janelle Jones, Cherrie Bucknor, 
Brian Dew, and Hayley Brown

*/

/* Notice

The underlying Current Population Survey data referenced here are in the
public domain. This program and related programs are distributed under the
GNU General Public License (GPL). See end of this file and 
http://www.gnu.org/licenses/ for details.

*/

/* Part 1: set directories */

	/* Platform switch: set gnulin=0 if you run Windows; 1 if GNU/Linux */
global gnulin = 0
global version "2.5.ch"

	/* Windows users will need to download Unix command line utilities
	  to add the gzip command to the Command Prompt. */

	/* The CEPR and US_Ineq_Repl globals used below are not set in this
	   file; they are expected to be defined before it is run. */

if $gnulin==0 | $gnulin==1 {

		/* Directories (identical on both platforms) */

	/* do files for NBER extracts */
	global do "$CEPR/CPS_ORG/CEPR/Do"
	/* do files for Basic CPS */
	global locbdo "$CEPR/CPS_Basic/CEPR/DoFiles"
	/* original NBER ORG extracts */
	global locin "$CEPR/CPS_ORG/NBER"
	/* Basic CPS extracts */
	global locbas "$CEPR/CPS_Basic/NBER"
	/* temporary files */
	global loctmp "$CEPR/CPS_ORG/CEPR/temp"
	/* final CEPR extracts */
	global locout "$US_Ineq_Repl/Processed/CEPR"

		/* Executables: ensure your gzip and unzip point to the correct folder */

	if $gnulin==1 {
		global gzip "/usr/bin/gzip"
		global unzip "/usr/bin/unzip"
		global copy "/bin/cp"
		global erase "/bin/rm"
	}
	else {
		global gzip "C:/cygwin64/bin/gzip.exe"
		global unzip "C:/Program Files (x86)/GnuWin32/bin/gunzip"
		global copy "copy"
		global erase "erase"
	}
}

/* Part 2: some preliminaries
Do not re-run for testing.
*/

/* a. convert original NBER files from 2-digit to 4-digit years */
cd "$do"
do "cepr_org_y2k.do"


/* b. read raw data 1994- */

cd "$locbdo"
/* must change this when adding new months of data */
do "cepr_basic_read_all.do"

/* program switches -- run only once */

/* Call the per-year reader commands b1994, b1995, ..., b2020 in ascending
   order; each is described as reading months 1-12 of its year.
   NOTE(review): the bYYYY commands are presumably defined by
   cepr_basic_read_all.do -- confirm. Raise the upper bound when a new
   year of data is added. */
forvalues yr = 1994/2020 {
	b`yr'
}

/* c. convert monthly files into annual (or near annual) data sets */

capture program drop fullyr
program define fullyr
version 7.0
/*
 fullyr: cut the monthly raw Basic CPS files down to ORG observations.

 syntax:	fullyr datayear firstmonth lastmonth [datayear firstmonth lastmonth ...]

 Arguments are consumed in groups of three. For every month from firstmonth
 to lastmonth of each datayear, the program
   1. loads cps_basic_raw_<datayear>_<month>.dta from the <datayear>
      subfolder of the locbas directory;
   2. keeps rows with hrmis 4 or 8 and with peage at least 16 and nonmissing;
   3. drops rows whose pwsswgt or pworwgt is negative;
   4. saves the result as cepr_org_<datayear>_<month>.dta in the locin
      directory.
 Side effect: the working directory is left at the locin directory.
*/
while "`1'"~="" {
	forvalues month = `2'(1)`3' {
		/* Forward slashes are valid path separators in Stata on every
		   platform, so one path serves both Windows and GNU/Linux. This
		   avoids placing a backslash (Stata's macro-escape character)
		   directly before a macro, and guarantees a file is always loaded
		   regardless of the value of the gnulin switch. */
		use "$locbas/`1'/cps_basic_raw_`1'_`month'.dta", clear
		keep if hrmis==4 | hrmis==8 /* keep only ORG observations */
		keep if 16<=peage & peage~=.
		drop if pwsswgt<0 /* drop if observation has missing CPS weight */
		drop if pworwgt<0 /* drop if observation has missing ORG weight */
		lab var year "Year"
		notes: Age 16 and older only
		compress
		cd "$locin" /* save with NBER extracts */
		saveold "cepr_org_`1'_`month'.dta", replace
	}
	mac shift 3
}
end

capture program drop combcps
program define combcps
version 7.1
/*
 combcps: stack the monthly ORG files of a year into one file.

 syntax:	combcps datayear firstmonth lastmonth [datayear firstmonth lastmonth ...]

 Arguments are consumed in groups of three. For each group the files
 cepr_org_<datayear>_<month>.dta (firstmonth through lastmonth) in the
 locin directory are appended in month order and saved there as
 morg<datayear>.dta; the monthly files are then deleted with the command
 held in the erase global.
 Side effect: the working directory is left at the locin directory.
*/
while "`1'"~="" {
	/* inputs and output are all saved with the NBER extracts */
	cd "$locin"

	/* start from the first month, then append the remaining ones */
	use "cepr_org_`1'_`2'.dta", clear
	local nextm = `2' + 1
	forvalues m = `nextm'(1)`3' {
		append using "cepr_org_`1'_`m'.dta"
	}
	saveold "morg`1'.dta", replace

	/* remove the monthly intermediates through the shell */
	forvalues m = `2'(1)`3' {
		!$erase "cepr_org_`1'_`m'.dta"
	}
	mac shift 3
}
end

/* Part 3: create consistent set of core variables, by topic 
*/

/* Run cepr_org_keepord.do from the ORG do-file directory. Per the note on
   the line below, it loads the program that keeps and orders the output;
   keepord is invoked later inside orgnber and orgcpsb. */
cd "$do"
do "cepr_org_keepord.do" /* load program to keep and order output */

/* Part 3.a. for 1979-1993, use NBER ORG extract */

capture program drop orgnber
program define orgnber
version 7.0
/*
 orgnber: build the 1979-1993 CEPR ORG extract, based on the NBER extract.

 syntax:	orgnber year [year ...]

 For each year: loads morg<year>.dta from the locin directory, drops all
 value labels, runs the cepr_org_* topic do-files from the do directory in
 a fixed order, calls keepord and sortit, labels the dataset with the
 version global and the year, and saves cepr_org_<year>.dta in the locout
 directory.
 Side effect: the working directory is left at the locout directory.
*/
while "`1'"~="" {
	cd "$locin"
	use "morg`1'.dta", clear
	label drop _all

	lab var year "Year"
	notes: Age 16 and older only

	/* topic do-files; the order of the list is the order of execution */
	cd "$do"
	foreach topic in idvar demog family empstat geog educ ind occ hours topcode_lognormal wages {
		do "cepr_org_`topic'.do"
	}

	keepord /* keeps and orders consistent variables */

	compress
	sortit
	lab data "CEPR ORG Extract, Version $version, `1'"
	cd "$locout"
	saveold "cepr_org_`1'.dta", replace

	mac shift
}
end

/* Part 3.b. for 1994-, use monthly CPS Basic files */

capture program drop orgcpsb
program define orgcpsb
version 7.0
/*
 orgcpsb: build the CEPR ORG extract from 1994 on, based on the monthly
 CPS Basic files.

 syntax:	orgcpsb year [year ...]

 For each year: loads morg<year>.dta from the locin directory, runs the
 cepr_basic_* topic do-files from the locbdo directory in a fixed order,
 then cepr_org_wages.do and cepr_org_keepord.do from the do directory,
 calls keepord and sortit, labels the dataset with the version global and
 the year, and saves cepr_org_<year>.dta in the locout directory.
 Side effect: the working directory is left at the locout directory.
*/
while "`1'"~="" {
	cd "$locin"
	use "morg`1'.dta", clear
	lab var year "Year"
	notes: Age 16 and older only

	/* Basic CPS topic do-files; list order is execution order */
	cd "$locbdo"
	foreach topic in idvar demog family empstat geog educ ind occ hours topcode_lognormal {
		do "cepr_basic_`topic'.do"
	}

	/* wages and the keep/order program come from the ORG do-file directory;
	   cepr_org_keepord.do is re-run on every pass through the loop */
	cd "$do"
	do "cepr_org_wages.do"
	do "cepr_org_keepord.do"
	keepord /* keeps and orders consistent variables */

	compress
	sortit
	lab data "CEPR ORG Extract, Version $version, `1'"
	cd "$locout"
	saveold "cepr_org_`1'.dta", replace

	mac shift
}
end

/* program switches */

	/* 1979 - 1993 NBER */

forvalues yr = 1979/1993 {
	orgnber `yr'
}



	/* 1994 - 2020 CPS Basic */

		/* read raw CPS data: months 1-12 of every year. All years are
		   read first, then all years are combined, as separate phases. */

forvalues yr = 1994/2020 {
	fullyr `yr' 1 12
}


forvalues yr = 1994/2020 {
	combcps `yr' 1 12
}


		/* process ORG data from Basic CPS */

forvalues yr = 1994/2020 {
	orgcpsb `yr'
}


/* Release notes

2.5 Feb 05, 2020
		1. Added 2019 data
		2. Update to CPI-U-RS price adjustment values
		3. Corrected rw_ot labeling
2.4.1 July 24, 2019
		1. Corrected coding errors for selfemp, selfinc, pubsect, pubfed, pubst, publoc
		2. Update to CPI-U-RS price adjustment values
2.4	March 22, 2019
		1. Added 2018 data
		2. Addressed issue with gzip for Windows users
		3. Update to CPI-U-RS price adjustment values
2.3	April 10, 2018
		1. Added 2017 data
		2. Added racea variable "Detailed Asian origin"
		3. Added variables cert and certgov to identify occupational licensing 
		   and whether government issued.
		4. Fixed error that excluded unmarried reference persons from variables
		   ownchild, ch05, ch613, ch1417 for the years 1984 to 1993.
		5. Update to many CPI-U-RS price adjustment values from 1979 to present.
2.2.1   April 21, 2017
        	1. Added new variable: hourslwm "hours last week, main job"
2.2     February 9, 2017
        	1. Added 2016 data
        	2. Fixed error in w_ln_no and rw for nonhourly workers with hours that
           	vary in 2015
        	3. Added cow1 - class of worker for first job
        	4. Expanded age range for school enrollment variables 2013-present;
           	now ages 16-54
        	5. Deleted data for 1999 for following variables: ownchild, ch02, ch05,
           	ch35, ch613, ch1417
        	6. Fixed error that had cmsacode14 missing in 2015
2.1.1	October 12, 2016
		1. Added cow2 - class of worker for second job
2.1		March 2, 2016
		1. Added 2015 data
		2. Added faminc variable "Family income band"
		3. Renamed cmsacode to cmsacode05, added new cmsacode14 variable
			starting in May 2014
		4. Created new smsastat14 variable to incorporate changes to
			FIPS metro area codes in May 2014
		5. Fixed error in educ92 for 2002
2.0.1	April 1, 2015
		1. fixed error in hhid2, hrsample, hrsersuf, and hhnum variables
2.0		March 12, 2015
		1. Added 2014 data
		2. Now uses NBER MORG extract only for 1979-1993; previously was 1979-2002
			Uses CPS Basic from 1994-present
		3. Corrected coding error in 2012 for race variables: wbho, wbhao, wbhom,
			wbhaom, racehpia
		4. Corrected coding error for vet in 2005
		5. Corrected coding error for selfemp, selfinc, pubsect, pubfed, pubst, publoc
		6. Dropped student, studpt variables
		7. Added new school enrollment variables: schenrl, schhs, schcol, schft, schpt
		8. Added multjob variable - has more than one job
		9. Added multjobn variable - number of jobs
		10. Added paid employees info variables: pdemp1, pdemp2, nmemp1, nmemp2
		11. Corrected coding error in ownchild, ch02, ch05, ch35, ch613, ch1417
		12. Corrected coding error in famrel94
		13. Corrected coding error in 2004 for metro, centcity, suburb, rural
		14. Extended principalcty variable back to 1994
		15. Extended fipscountry and cbsasz back to Sep 1995
		16. Renamed smsastat06 to smsastat05, and extended back to 2005
		17. Dropped chi variable for Chicago
		18. Corrected coding error in nyc, and la variables
		19. Corrected coding error in hourslwa
		20. Dropped hrmgfail - no longer nec. since we use CPS Basic from 94-on
		21. Corrected coding error in blsimpt
		22. uhourse: for years 94-present, now uses pehrusl1 for non-hourly, and peernhro
			for hourly workers
		23. Added longitudinal weight (lonwgt) and family weight (famwgt) to keepord program
		24. Added ind_m03 variable "major industry recode" for 2003-present
		25. Dropped ind11
		26. Added ind09, which is valid from Jan 09-April 2012
		27. Corrected coding error in ind12 - now available May 2012-December 2013
		28. Added ind14 variable
		29. Dropped occ13 variable
		30. Added occ12, which is valid May 2012-present
		31. Corrected coding error in occ11, now available Jan 2011-April 2012.
		32. Added occ_m03 variable "major occupation recode" for 2003-present
		33. Added peernuot to keepord program
		34. Dropped earnhre
		35. Corrected coding error in blsimph and blsimpw
		36. Added wage1, hourly earnings if paid by the hour, excluding otc
		37. Added weekpay, usual weekly earnings for hourly and non-hourly workers, including otc
		38. Added wage2, usual hourly earnings for nonhourly workers, including otc
		39. Added wage3, nber-style wage variable for usual hourly earnings, excluding otc for
			hourly workers, but including otc for nonhourly workers
		40. Added otcrec - usually receives otc. Different methodology for 79-93 vs. 94-present
		41. Added otcamt (formerly wkotc) - weekly earnings from otc
		42. Dropped wkotc
		43. Added wage4 - usual hourly earnings for hourly and nonhourly workers, including otc
		44. Dropped the following wage variables: w_nber, w_no_no, w_no_ot, w_ln_no, w_ln_ot, w_p7_no,
			w_p7_ot, w_p8_no, w_p8_ot, w_p9_no, w_p9_ot, rw_p8_no, rw_p8_ot, w_ln_noa, w_ln_ota, 
			w_p7_noa, w_p7_ota, w_p8_noa, w_p8_ota, w_p9_noa, w_p9_ota, rwa, rw_ota, rw_p8_noa,
			rw_p8_ota
		45. Changed rw and rw_ot, trims wage observations below $0.50 and above $200 in 1989$
		46. Added proxy "Self or proxy response", replacing lf_proxy
		47. Added wholine "Line number of respondent", replacing resp_lno
		48. Added reltoref "Relationship to reference person", replacing rel_ref
		49. Dropped lf_proxy, resp_lno, rel_refp
		50. Uses newer version of NBER MORG extract for 1979-1993 (accessed July 2014)

1.9.1	Sep 8, 2014
		1. Added missing why3594 to years 2003-2013
1.9 	Feb 26, 2014	
		1. Added 2013 data
1.8 	Jan 28, 2013
		1. Added 2012 data
		2. Added abpaid for 1994-
		3. Variables cbsasz and cmsacode have been updated for 2005-
		4. Added smsastat06 variable for 2006- 
		5. w_no_ot corrected for 1979-1988; many thanks to Julien Champagne
1.7.2 	Oct 31, 2012
		1. Corrected coding error in occ11 variable
1.7.1 	Oct 16, 2012
		1. Added 3-digit occupation variable, occ11, for 2011 
1.7   	Jan 19, 2012
		1. Added 2011 data
1.6.1 	Mar 17, 2011
		1. Corrected minor coding error in ind_2d variable
		2. Corrected notes for rw variable and adjusted notes for ownchild
1.6   	Feb 22, 2011
		1. Added 2010 data
		2. Added unemdur (unemployment duration) variable for 1994-
		3. Added jobloser, jobleaver, entrant variables for 1994-
		4. Raised the upper-bound for "trimmed" wage data in preferred
			CEPR wage series to $200 per hour in constant 1989 dollars
		5. Calculated separate estimated means above the top-code by gender
			for weekly earnings, and used these for preferred CEPR wage
			series
		6. Improved merging process of Basic CPS (1994-2002) with NBER data.
		7. Rounded imputed hours to nearest full hour
		8. Added hhid2, hrsample, hrsersuf; sorted data to facilitate merging 
			with outside data.
		9. Restored smsastat (city codes) from 2006-
		10. Added racehpia variable for 2003-
1.5.1 	May 26, 2010
		1. Corrected coding error in nilf variable
1.5   	Jan 4, 2010
		1. Added 2009 data; converted real wages to $2009
1.4.1	Oct 27, 2009
		1. Added wbhao and wbhaom variables, which include separate 
			Asian Pacific race/ethnicity category from 1988 and a
			separate Native American category from 2003
1.4   	Feb 12, 2009
		1. Added 2008 data
		2. Added metropolitan-area identifiers for New York, Los Angeles, 
			and Chicago for 2005-2008
		3. Added two new real-wage variables
1.3.1 	Oct 29, 2008
		1. Added educ92 variable
1.3   	Feb 15, 2008
		1. Added 2007 data
		2. Added ownchild variable for 2006 (previously missing)
		3. Adjusted immigrant "years in US" (prinusyr) for 2006
		4. Added additional income bands for faminc variable.
1.2.2 	Oct 12, 2007
		1. Corrected error in decimal places in cepr_basic_topcode_lognormal.do
			and cepr_basic_topcode_pareto.do affecting topcoded wages in
			2003-2006
1.2.1 	Jul 24, 2007
		1. Temporarily removed substate geography variables 2004-2006.
		2. Corrected hhnum for 2004+ (thanks Jeff Wenger; previously 
			incorrect, due to dictionary change in May 2004)
		3. Added proxy reporting variables for later years.

1.2 	Jan 24, 2007
		1. Added Jan-Aug 2006 data
		2. Corrected nilf variable in cepr_org_empstat.do
			(thanks Jeff Wenger)
		3. Corrected vet, rural, centcity, suburb variables for months
			8-12 of 2005 (error due to dictionary change in Aug 2005)
		4. Corrected wbho for August-December 2005 (previously missing,
			due to dictionary change in Aug 2005)
		5. Corrected missing industry variables for 2003+

1.1		Feb 27, 2006
		1.	Added 2005 data
		2.	Converted programs for dual GNU/Linux, Windows use
		3.	Changed procedure for adding new months of CPS
		4.	Added refper (reference person) variable, 1994-2005
		5.	Set procedure for trimming real wages (rw, rw_ot) to fix
			cutoffs at $1 and $100 in constant 1989 dollars
		6.	Shortened label length of some industry/occupation labels
			to satisfy 80-column label requirement for Stata 8.
0.96	Nov, 2005
		1. 	Updated to include 2003, 2004 data from CPS Basic files (not 
			NBER ORG extract as in earlier years)
		2. 	Manufacturing (manuf) variable now excludes construction; 
			service (servs) variable now includes construction
		3. 	Added new industry (ind03, ind_2d) and occupation codes 
			(occ03, docc03, manag03) to reflect switch in Jan 2003 CPS
			from SIC 1987 to NAICS 2002 and from SOC 1980 to SOC 2000; 
			variable manag83 replaces previous variable manager
		4. 	Correction to children's age variables in 1998 and 1999 in
			cepr_org_family.do
		5. 	Correction to label for famrel94 in cepr_org_family.do
		6. 	Correction to coding of publoc in all years in 
			cepr_org_empstat.do
		7. 	Added trimmed, top-code-adjusted, real wage variables 
			(rw, rw_ot)
		8. 	Changed name of year-arrived-in-US variable from peinusyr 
			to prinusyr
		9. 	Added new race and ethnicity variable (wbhom), to reflect 
			changes in CPS race and ethnicity options beginning in 2003
		10. 	Correction to agric variable in cepr_org_ind.do
	
0.91	Nov 8, 2003
		All NBER city-related codes added, without labeling or
		documentation to basic extracts
		
0.9		Sep 1, 2003
		Beta release
*/
 
/* 
Copyright 2020 CEPR and John Schmitt

Center for Economic and Policy Research
1611 Connecticut Avenue, NW Suite 400
Washington, DC 20009
Tel: (202) 293-5380
Fax: (202) 588-1356
http://www.cepr.net

This program and all programs referenced in it are free software. You
can redistribute the program or modify it under the terms of the GNU
General Public License as published by the Free Software Foundation;
either version 2 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
USA.
*/
